
import logging
from init import PATHS
LOGGER = logging.getLogger(__name__)
import os
import pandas as pd
import numpy as np
from C_PatentVariables import collecting_patstat, family_aggregation, firm_year_aggregation


def get_list_of_patent_ids():
    """Return the patent identifiers linked to the suppliers' bvdids.

    Reads the suppliers id file and the PATSTAT-Orbis correspondence file,
    keeps the correspondence rows whose ``bvdid`` appears in the suppliers
    file, and extracts the distinct identifiers from those rows.

    Returns:
        A tuple ``(list_appln_id, list_docdb_id)`` holding the de-duplicated
        values of the ``appln_id`` and ``docdb_family_id`` columns.
    """
    LOGGER.info('get_list_of_patent_ids')

    # Bvdids of the suppliers of OEMs.
    suppliers_file = PATHS.dropbox / 'Data_outputted/B_FactsetVariables/suppliers_ids.csv'
    supplier_bvdids = pd.read_csv(suppliers_file)['bvdid'].unique()

    # PATSTAT <-> Orbis correspondence, restricted to the suppliers.
    links_file = PATHS.privatedata / 'Patstat_orbis_correspondence/Orbis_PATSTAT_updatePEM_anonymized.csv'
    all_links = pd.read_csv(links_file)
    is_supplier = all_links['bvdid'].isin(supplier_bvdids)
    supplier_links = all_links[is_supplier]

    appln_ids = supplier_links['appln_id'].drop_duplicates().tolist()
    docdb_ids = supplier_links['docdb_family_id'].drop_duplicates().tolist()

    nbr_appln, nbr_docdb = len(appln_ids), len(docdb_ids)
    LOGGER.info('Suppliers: Nbr appln_id: {}'.format(nbr_appln))
    LOGGER.info('Supliers: Nbr docdb_family_id: {}'.format(nbr_docdb))
    return appln_ids, docdb_ids


def main(test=False):
    """Collect the suppliers' patent data and build the firm-year panel.

    Steps, in order:
      1. Get the appln/docdb ids of the suppliers and complete them with a
         first pass through PATSTAT.
      2. Collect application-level info, aggregate it at family level, and
         extract IPC/CPC codes, NACE codes and psn_sector.
      3. Aggregate at firm-year level, in chunks of bvdids, writing one
         temporary CSV per chunk.
      4. Concatenate the chunk CSVs into ``Panel_suppliers_FamInfo.csv`` and
         delete the temporary chunk files.

    Args:
        test: Flag forwarded unchanged to the ``collecting_patstat`` helpers.

    Raises:
        ValueError: From ``pd.concat`` if there is no bvdid to aggregate
            (no chunk file is produced, so there is nothing to concatenate).
    """
    LOGGER.info('SCRIPT: c_patent_suppliers.py')

    # COLLECTING PATENTS
    LOGGER.info("       Collect info about suppliers patents")
    LOGGER.info("       test: {}".format(test))
    list_appln_id, list_docdb_id = get_list_of_patent_ids()
    # Just to be sure, do a first pass through PATSTAT to collect all the
    # docdb ids corresponding to the appln_ids we have.
    list_appln_id, list_docdb_id = collecting_patstat.collecting_ids(list_appln_id, list_docdb_id, test)
    namefile = 'suppliers'
    LOGGER.info("       Collect info at application level")
    collecting_patstat.collecting_info_on_applnids(list_appln_id, list_docdb_id, namefile, test, small=True)
    LOGGER.info("       Aggregating suppliers patents at family level")
    family_aggregation.aggregate_at_familylevel(namefile)
    LOGGER.info("       Collect all ipc and cpc codes of suppliers patents and aggregate them at family level")
    collecting_patstat.extract_and_save_cpc_ipc_codes(namefile, test)
    LOGGER.info("       Collect naces codes associated to suppliers patents and aggregate them at family level")
    collecting_patstat.extract_and_save_nace_codes(namefile, test)
    LOGGER.info("       Collect psn_sector of applicants associated to patents and aggregate them at family level")
    collecting_patstat.extract_and_save_psnsector(list_appln_id, namefile, test)

    # AGGREGATING at supplier level
    LOGGER.info("       AGGREGATION FIRM-YEAR")
    # Import docdbids - bvdids info
    df_bvdid_docdbid = firm_year_aggregation.get_docdbids_of_bvdids('suppliers')
    # Deep copy so that adding 'Year' below does not touch df_bvdid_docdbid.
    df_agg_corres = df_bvdid_docdbid.copy(deep=True)
    df_agg_corres['Year'] = df_agg_corres['earliest_filing_year']

    # Chunk df_agg_corres because otherwise the step where the cumulative
    # stock is calculated is too demanding RAM-wise.
    # NB: sort_values() orders the bvdids alphabetically; this reduces the
    # probability that two massive firms are next to each other in the list,
    # which makes the chunks more balanced.
    listbvdids = df_agg_corres['bvdid'].sort_values().unique().tolist()
    n = 300
    chunks = [listbvdids[i:i + n] for i in range(0, len(listbvdids), n)]
    # Only the first (up to) five chunks are sized for the log. Slicing rather
    # than indexing range(5) avoids an IndexError when there are fewer chunks.
    sample_sizes = [
        df_agg_corres[df_agg_corres['bvdid'].isin(chunk)].shape[0]
        for chunk in chunks[:5]
    ]
    LOGGER.info(f'Nbr of firms per chunks: {n}. Nbr of chunks: {len(chunks)}')
    LOGGER.info(f'Nbr of obser (families) in each chunk: {sample_sizes}')

    output_dir = PATHS.dropbox / 'Data_outputted/C_PatentVariables'
    chunk_files = []
    chunk_panels = []
    for i, chunk in enumerate(chunks):
        LOGGER.info(f'Chunk {i} / {len(chunks)}')
        df_agg_corres_chunk = df_agg_corres[df_agg_corres['bvdid'].isin(chunk)]
        firm_year_aggregation.main(df_agg_corres_chunk, df_bvdid_docdbid, aggregation_level='bvdid', namefile=f'suppliers{i}')
        # Read back the per-chunk panel that is expected at this path after
        # the aggregation call above.
        chunk_file = output_dir / f'Panel_suppliers{i}_FamInfo.csv'
        chunk_files.append(chunk_file)
        chunk_panels.append(pd.read_csv(chunk_file))

    df_firmyear = pd.concat(chunk_panels)
    df_firmyear.to_csv(output_dir / 'Panel_suppliers_FamInfo.csv', index=False)

    # The per-chunk files are only intermediates; remove them once the
    # combined panel has been written.
    for chunk_file in chunk_files:
        os.remove(chunk_file)
    LOGGER.info('SCRIPT END: c_patent_suppliers.py')

